# Load Pandas libraries
import warnings
import pandas as pd
import numpy as np
import math
from collections import Counter
# Load scikit-learn library for K-Means
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
# Import Plot libraries
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cbook
from mpl_toolkits.mplot3d import Axes3D
%matplotlib inline
# File params
init_year = 2018
n_pc = 5
with_geo = "geo"
# Read raw data: weekly pivot table indexed by department
csv_file = f'im_weekly_dpt_pivot_{init_year}-2019_{with_geo}.csv'
data_url = '../data/' + csv_file
rawdata = pd.read_csv(data_url, index_col=['Department'])
# Optionally drop the capital district (treated as an outlier)
remove_dc = False
if remove_dc:
    rawdata = rawdata[rawdata.index != 'BOGOTA']
rawdata
# Read departments data
dataURL = '../data/col_dpt_list.csv'
dpt_data = pd.read_csv(dataURL)
# Map each department name to its geographic zone
dpt_zone = dict(zip(dpt_data['department'], dpt_data['zone']))
dpt_zone
# Split the department labels from the numeric series
dpt_list = pd.DataFrame(rawdata.index)
data = rawdata.reset_index(drop=True)
col_list = data.columns
# Standardize the data (zero mean, unit variance per column)
std = True
x = data.values
if std:
    x = StandardScaler().fit_transform(x)
# Preview the standardized values in a temporary dataframe
df_data = pd.DataFrame(data=x, columns=col_list)
df_data.head()
# Calculate the pairwise correlation between the standardized department series
corr = df_data.corr()
# Generate a mask for the upper triangle (only the lower half stays visible)
# BUG FIX: np.bool was deprecated in NumPy 1.20 and removed in 1.24;
# the builtin bool is the documented replacement.
mask = np.zeros_like(corr, dtype=bool)
mask[np.triu_indices_from(mask)] = True
# Set up the matplotlib figure
fig, ax1 = plt.subplots(figsize=(18, 18))
# Generate a custom diverging colormap (red-to-blue, 9 steps)
cmap = sns.diverging_palette(10, 240, n=9)
# Draw the heatmap with the mask and correct aspect ratio
sns.heatmap(corr, mask=mask, cmap=cmap, vmin=-1, vmax=1, center=0,
            square=True, linewidths=.5, cbar_kws={"shrink": .5})
ax1.set_xticklabels(ax1.get_xticklabels(), rotation=45, horizontalalignment='right');
# Add title
ax1.set_title("Department Correlation Triangle", fontsize=20)
plt.show()
# Set color list (tab10-style hex palette, 10 entries)
color_list = ["#1f77b4", "#ff7f0e", "#d62728", "#2ca02c", "#9467bd",
              "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# Assign each geographic zone a fixed colour from the list, in order
zones = ["AMAZONIA", "ANDINA", "CARIBE", "ORINOQUIA", "PACIFICO", "EXTERIOR"]
zone_palette = dict(zip(zones, color_list))
# Principal Component Analysis on the standardized matrix x
# NOTE(review): n_components is hard-coded to 5 although n_pc (= 5) is defined
# above; df_pca below also assumes exactly 5 PCs — keep all three in sync.
pca = PCA(n_components = 5)
pca_data = pca.fit_transform(x)
# Derive zone and colour columns from the department name
def add_zone(data):
    """Add 'Zone' and 'Color' columns derived from the 'Department' column.

    Uses the module-level dpt_zone and zone_palette mappings. Departments
    absent from dpt_zone keep their original value, because pandas' replace
    leaves unmatched entries untouched.
    """
    color_map = {dpt: zone_palette[zn] for dpt, zn in dpt_zone.items()}
    data["Zone"] = data["Department"].replace(dpt_zone)
    data["Color"] = data["Department"].replace(color_map)
    return data
# Create and show the principal components DataFrame
pc_cols = ["PC1", "PC2", "PC3", "PC4", "PC5"]
df_pca = pd.DataFrame(data=pca_data, columns=pc_cols)
df_pca = pd.concat([df_pca, dpt_list], axis=1)
# Drop rows padded with NaN by the concat, then tag zones and colours
df_pca = df_pca[df_pca["PC1"].notnull()]
df_pca = add_zone(df_pca)
df_pca.head(50)
# Heat-map of the correlation between the extracted components
fig, ax = plt.subplots(figsize=(10, 10))
sns.heatmap(df_pca.corr(), square=True, annot=True)
ax.set_title("Correlation between Components", fontsize=16)
plt.show()
# Explained variance ratio: how much information each component carries
list(pca.explained_variance_ratio_)
# Bar chart of per-component variance plus the cumulative curve
bars = ("PC1", "PC2", "PC3", "PC4", "PC5")
y_pos = np.arange(len(bars))
values = pca.explained_variance_ratio_ * 100
cum = np.cumsum(values)
# Set up the matplotlib figure
fig, ax2 = plt.subplots(figsize=(12, 10))
plt.bar(y_pos, values, align="center", alpha=0.7)
plt.xticks(y_pos, bars)
plt.plot(y_pos, cum, color="orange", linewidth=2, marker="o")
plt.title("Variance Ratio By Component", fontsize=20)
# Label each cumulative point with its percentage
for i, v in enumerate(cum):
    ax2.text(i - .15, v + 1, str(round(v, 1)) + "%",
             color="black", fontweight="normal", fontsize=11)
# Plot setup
plt.xlabel("Components", fontsize=12)
plt.ylabel("Explained variance in percent", fontsize=12)
# NOTE(review): the legend labels are assigned to artists in creation order
# (bars first, then the cumulative line) — confirm "Cum"/"Var" are not swapped.
plt.legend(("Cum", "Var"), loc="best")
plt.show()
# Matrix plot of each principal component's loading on the original columns
fig = plt.figure(figsize=(16, 2))
plt.matshow(pca.components_, cmap="viridis", fignum=fig.number, aspect="auto")
plt.yticks([0, 1, 2, 3, 4], ["PC1", "PC2", "PC3", "PC4", "PC5"], fontsize=10)
plt.colorbar()
plt.xticks(range(len(data.columns)), data.columns, rotation=65, ha="left")
plt.show()
# Show the total explained variance ratio of model: Only 2 components
n_components = 2
sum(pca.explained_variance_ratio_[0:n_components]) * 100
# Eigen-vectors data used by the 2D arrow overlay below
n_vectors = 2
# Explained variance (eigenvalues) of the first two components: arrow scales
lengths = pca.explained_variance_[0:n_vectors]
# NOTE(review): this slices components_ rows by n_components but columns by
# n_vectors (i.e. the first two ORIGINAL features), and mean_ likewise takes
# the first two feature means — presumably a projection for drawing in the
# PC1/PC2 plane; confirm the row/column mixing is intentional.
vectors = pca.components_[0:n_components, 0:n_vectors]
means = pca.mean_[0:n_vectors]
# Draw a single arrow from v0 to v1 on the given (or current) axes
def draw_vector(v0, v1, ax=None):
    """Annotate the plane with an orange arrow pointing from v0 to v1."""
    axes = ax or plt.gca()
    style = dict(arrowstyle="->", linewidth=2, shrinkA=0, shrinkB=0, color="#ff7f0e")
    axes.annotate("", v1, v0, arrowprops=style)
# 2D scatter of the first two principal components, annotated per department
fig, ax3 = plt.subplots(figsize=(12, 12))
plot = sns.regplot(ax=ax3, data=df_pca, x="PC1", y="PC2", fit_reg=False,
                   marker="o", color="#1f77b4", scatter_kws={"s": 75})
# Label every department next to its point
for ix in range(df_pca.shape[0]):
    plot.text(df_pca.PC1[ix] + 0.2, df_pca.PC2[ix] - 0.05, df_pca.Department[ix],
              horizontalalignment="left", size="medium", color="black", weight="normal")
# Overlay the eigen-vectors, scaled by 3 * sqrt(explained variance)
for length, vector in zip(lengths, vectors):
    draw_vector(means, means + vector * 3 * np.sqrt(length))
# Plot setup
ax3.set_xlabel("PC 1", fontsize=12)
ax3.set_ylabel("PC 2", fontsize=12)
ax3.set_title("2D - IM by Departments", fontsize=20)
ax3.legend(["Departments"])
ax3.grid()
# Show the total explained variance ratio of model: Only 3 components
sum(pca.explained_variance_ratio_[0:3]) * 100
# 3D scatter plot of the first three principal components
fig = plt.figure(figsize=(16, 16))
ax4 = fig.add_subplot(111, projection="3d")
# Get (x, y, z) axis values as (n, 1) column arrays
xx = df_pca.loc[:, ["PC1"]].values
yy = df_pca.loc[:, ["PC2"]].values
zz = df_pca.loc[:, ["PC3"]].values
# Plot values
ax4.scatter(xx, yy, zz, c="#1f77b4", marker="o", s=75)
# Add annotations one by one with a loop
# BUG FIX: iterate over df_pca's rows (as the other annotation loops do)
# instead of len(x) — x is the standardized feature matrix, which can be
# longer than df_pca after the notnull() filter and would index out of range.
for ix in range(df_pca.shape[0]):
    # [ix, 0] extracts a scalar directly; float() on a size-1 array is
    # deprecated in NumPy 1.25+
    ax4.text(xx[ix, 0], yy[ix, 0], zz[ix, 0], df_pca.Department[ix],
             horizontalalignment="left", size="medium", color="black", weight="normal")
# Plot setup
ax4.set_xlabel("PC 1", fontsize=12)
ax4.set_ylabel("PC 2", fontsize=12)
ax4.set_zlabel("PC 3", fontsize=12)
ax4.set_title("3D - IM by Departments", fontsize=20)
ax4.legend(["Departments"])
ax4.grid()
# Creating training dataset: the first n_pc principal components feed K-Means,
# and the target cluster count grows with the dimensionality
if n_pc == 2:
    n_cluster = 5
    x = df_pca['PC1'].values
    y = df_pca['PC2'].values
    ds_train = np.array(list(zip(x, y)))
elif n_pc == 3:
    n_cluster = 6
    x = df_pca['PC1'].values
    y = df_pca['PC2'].values
    # BUG FIX: the third feature previously duplicated PC2; it must be PC3
    z = df_pca['PC3'].values
    ds_train = np.array(list(zip(x, y, z)))
else:
    n_cluster = 7
    x = df_pca['PC1'].values
    y = df_pca['PC2'].values
    z = df_pca['PC3'].values
    a = df_pca['PC4'].values
    b = df_pca['PC5'].values
    ds_train = np.array(list(zip(x, y, z, a, b)))
print('Nro. PCs: %s and Nro. Clusters: %s' % (n_pc, n_cluster))
# Calculating the Jambu Elbow scores: K-Means inertia for k = 1..19
Nc = range(1, 20)
# CONSISTENCY FIX: pin random_state=0 (run_kmeans below does the same) so the
# elbow curve is reproducible between runs
kmeans = [KMeans(n_clusters=k, random_state=0) for k in Nc]
score = [model.fit(ds_train).inertia_ for model in kmeans]
# Plot the curve with the chosen n_cluster marked by a vertical line
fig, ax0 = plt.subplots(figsize=(14, 6))
plt.plot(Nc, score, marker='o')
plt.axvline(x=n_cluster, color="#8b0000", linestyle="--")
plt.xticks(np.arange(1, 20, 1))
plt.xlabel("Number of Clusters", fontsize=12)
plt.ylabel("Score", fontsize=12)
plt.title("Jambu Elbow Curve", fontsize=20)
plt.show()
# Fit K-Means on the module-level ds_train and visualize the result
def run_kmeans(k_clusters, labels):
    """Cluster ds_train into k_clusters groups; return (clusters, centroids).

    labels: when True, the plot annotates centroids and departments.
    """
    # NOTE(review): algorithm="full" was renamed to "lloyd" in scikit-learn
    # 1.0 and removed later — confirm the installed version still accepts it.
    model = KMeans(n_clusters=k_clusters, algorithm="full", random_state=0, max_iter=500)
    model = model.fit(ds_train)
    # Cluster label per training row
    cluster_ids = model.predict(ds_train)
    # Centroid coordinates per cluster
    centers = model.cluster_centers_
    # Plotting K-Means result
    plot_kmeans_data(ds_train, k_clusters, centers, cluster_ids, labels)
    return cluster_ids, centers
# Scatter plot of the clustered points, their centroids and optional labels
def plot_kmeans_data(data, k_clusters, centroids, clusters, labels):
    """Draw each cluster in its own colour with grey-diamond centroids.

    labels: when True, number each centroid and annotate every department
    from the module-level df_pca.
    """
    fig, ax = plt.subplots(figsize=(14, 14))
    n_data = len(data)
    # Centroids first, as grey diamonds
    plt.scatter(centroids[:, 0], centroids[:, 1], s=25, color="gray", marker="D")
    # One legend patch per cluster
    legend_list = [mpatches.Patch(color=color_list[i], label='Cluster ' + str(i + 1))
                   for i in range(k_clusters)]
    if labels:
        for i in range(k_clusters):
            ax.text(centroids[i][0] + 0.1, centroids[i][1] - 0.1, str(i + 1), fontsize=20)
    # Draw the member points of each cluster
    for i in range(k_clusters):
        points = np.array([data[j] for j in range(n_data) if clusters[j] == i])
        sns.scatterplot(ax=ax, x=points[:, 0], y=points[:, 1], s=100, color=color_list[i])
    # Optionally annotate every department next to its point
    if labels:
        for ix in range(df_pca.shape[0]):
            ax.text(df_pca.PC1[ix] + 0.2, df_pca.PC2[ix] - 0.1, df_pca.Department[ix],
                    horizontalalignment="left", size="medium", color="black", weight="normal")
    # Plot setup
    ax.set_xlabel("PC 1", fontsize=12)
    ax.set_ylabel("PC 2", fontsize=12)
    ax.set_title("IM Clustering", fontsize=20)
    ax.grid()
    plt.legend(handles=legend_list)
    plt.show()
# Run K-Means with the chosen cluster count and annotate the points
clusters, centroids = run_kmeans(k_clusters=n_cluster, labels=True)
# Scatter plot of PC1/PC2 coloured by geographic region
fig, ax5 = plt.subplots(figsize=(14, 14))
# One legend patch per zone
legend_list = [mpatches.Patch(color=clr, label=zone)
               for zone, clr in zone_palette.items()]
plot = sns.regplot(ax=ax5, data=df_pca, x="PC1", y="PC2", fit_reg=False,
                   marker="o", scatter_kws={"s": 75, "facecolors": df_pca["Color"]})
# Label every department next to its point
for ix in range(df_pca.shape[0]):
    plot.text(df_pca.PC1[ix] + 0.2, df_pca.PC2[ix] - 0.05, df_pca.Department[ix],
              horizontalalignment="left", size="medium", color="black", weight="normal")
# Plot setup
ax5.set_xlabel("PC 1", fontsize=12)
ax5.set_ylabel("PC 2", fontsize=12)
ax5.set_title("IM by Departments by Region", fontsize=20)
ax5.grid()
plt.legend(handles=legend_list)
plt.show()
# Clustering results dataframe (filled later by avg_time_series)
df_result = pd.DataFrame(columns=["n_item", "std_dev", "n_item_2", "w_std_dev"])
# Count departments per cluster; keys are 1-based cluster numbers as strings
cluster_count = Counter(str(ix + 1) for ix in clusters)
# Cooking dataframe
df = pd.DataFrame.from_records(cluster_count.most_common(),
                               columns=['cluster', 'frequency']).sort_values(by=['cluster'])
fig, ax = plt.subplots()
df.plot.bar(ax=ax, x='cluster', y='frequency', alpha=0.75, figsize=(10, 7), color=color_list)
plt.title('Departments by Cluster', fontsize=16)
plt.xlabel('Cluster')
plt.ylabel('Frequency')
ax.get_legend().remove()
ax.grid()
plt.xticks(rotation=0)
plt.show()
# Tally zone membership inside each cluster
# NOTE(review): the outer Counter is only used as a plain dict of Counters
cluster_data = Counter()
for ix, row in df_pca.iterrows():
    cluster = str(clusters[ix] + 1)
    cluster_data.setdefault(cluster, Counter())[row['Zone']] += 1
sorted(cluster_data.items())
# Silence matplotlib deprecation chatter emitted by pandas' plotting backend
# NOTE(review): matplotlib.cbook.mplDeprecation was removed in matplotlib 3.6 —
# confirm the installed version still exposes it.
warnings.filterwarnings("ignore", category=matplotlib.cbook.mplDeprecation)
fig = plt.figure(figsize=(20, 4))
fig.subplots_adjust(hspace=0.2, wspace=0.2)
# BUG FIX: the original iterated `for ix in clusters`, i.e. once per
# department, re-creating and re-drawing each cluster's subplot several
# times. Iterate each cluster index exactly once instead.
for ix in range(n_cluster):
    ax = plt.subplot(1, n_cluster, ix + 1)
    c_name = str(ix + 1)
    # c_name is already a string — no redundant str() needed
    data = cluster_data[c_name].most_common()
    df = pd.DataFrame.from_records(data, columns=['zone', 'frequency']).sort_values(by='frequency', ascending=False)
    df.plot.bar(ax=ax, x='zone', y='frequency', color=color_list[ix], alpha=0.75)
    ax.get_legend().remove()
    plt.title('Cluster ' + c_name, fontsize=16)
    # BUG FIX: the axis labels were swapped (x holds the zone, y the frequency)
    plt.xlabel('Zone')
    plt.ylabel('Frequency')
    plt.xticks(rotation=45)
plt.show()
# Group the department names by their assigned cluster (1-based keys)
dpt_cluster = {}
for i in range(n_cluster):
    cluster_name = (i + 1)
    print('>> Cluster %s:' % cluster_name)
    dpt_list = [df_pca.iloc[j]['Department']
                for j in range(len(clusters)) if clusters[j] == i]
    dpt_cluster[cluster_name] = dpt_list
    print(' ', dpt_list)
# Read raw data
data_url = "../data/im_weekly_dpt_data.csv"
dptdata = pd.read_csv(data_url, parse_dates=["date"])
dptdata
# Build one weekly time series per department, keyed by department name
dpt_ts = {}
for departments in dpt_cluster.values():
    for dpt in departments:
        subset = dptdata[dptdata['department'] == dpt]
        dpt_ts[dpt] = subset.set_index('date')['value'].asfreq(freq='W')
# Average a list of time series and record dispersion stats into df_result
def avg_time_series(ts_data, ts_list, allow_dbl, c_name):
    """Return the element-wise average of the department series in ts_list.

    ts_data   : dict mapping department name -> weekly pandas Series
    ts_list   : department names to average
    allow_dbl : when False, round the averaged series up to whole numbers
    c_name    : 1-based cluster number; row c_name-1 of the module-level
                df_result is filled with [n, std_dev, n*n, n*std_dev]
    """
    avg_ts = None
    df = None
    n = len(ts_list)
    for dpt in ts_list:
        ts = ts_data[dpt]
        if avg_ts is None:
            avg_ts = ts
            df = pd.DataFrame(ts)
        else:
            # fill_value=0 so weeks missing in one series still contribute
            avg_ts = avg_ts.add(ts, fill_value=0)
            df = pd.concat([df, pd.DataFrame(ts)], axis=1, sort=False)
    avg_ts = avg_ts / n
    if not allow_dbl:
        # BUG FIX: math.ceil() cannot be applied to a whole Series (it raises
        # TypeError for len > 1); use the element-wise np.ceil instead.
        avg_ts = np.ceil(avg_ts)
    # Dispersion: mean per-week std dev expressed as a % of the peak value
    # NOTE(review): dividing by len(df) (number of weeks) — confirm this is
    # the intended normalization rather than the number of series.
    std_dev = 0
    max_value = 0
    if len(df.columns) > 1:
        for ix, row in df.iterrows():
            values = np.nan_to_num(row.values)
            max_value = max(np.max(values), max_value)
            std_dev += np.std(values)
        std_dev = 100.0 * std_dev / len(df) / max_value
    print('n time series:', n, ', std dev:', std_dev, 'max value:', max_value)
    df_result.loc[c_name - 1] = [n, std_dev, n * n, n * std_dev]
    return avg_ts
# Plot every department curve in a cluster plus the cluster average
def plot_cluster_curves(data, dpt_list, c_name, allow_dbl=True):
    """Plot each department's weekly series (faded) and their average (black).

    As a side effect, avg_time_series records dispersion stats in df_result.
    """
    plt.figure(figsize=(18, 5), dpi=200)
    # Individual curves, faded so the average stands out
    for dpt in dpt_list:
        plt.plot(data[dpt], label=dpt, alpha=0.25)
    # Average curve on top
    plt.plot(avg_time_series(data, dpt_list, allow_dbl, c_name),
             label='average', alpha=1, color='black')
    plt.title('Cluster %s IM trends' % c_name, fontsize=16)
    plt.xlabel('Date', fontsize=10)
    plt.ylabel('Num. Deaths', fontsize=10)
    plt.xticks(fontsize=10)
    plt.legend()
    plt.show()
# Plot the IM trend curves of every cluster.
# Consolidates the original copy-pasted per-cluster cells into a single loop;
# n_cluster is always 5, 6 or 7 here, so behavior is unchanged (clusters 1-4
# were plotted unconditionally, 5-7 guarded by n_cluster checks).
for c_name in range(1, n_cluster + 1):
    plot_cluster_curves(dpt_ts, dpt_cluster[c_name], c_name)
df_result
# RMS of cluster sizes: sqrt(mean(n^2)) weights large clusters more heavily
# than a plain mean — presumably intentional (n_item_2 stores n*n); confirm.
cluster_avg_item = math.sqrt(np.mean(df_result['n_item_2'].values))
# Std dev weighted by cluster size: sum(n * std_dev) / sum(n)
avg_wn_std_dev = np.sum(df_result['w_std_dev'].values) / np.sum(df_result['n_item'].values)
print('Avg Items by Cluster: %0.4f' % cluster_avg_item)
print('Avg WN Std Dev: %0.4f' % avg_wn_std_dev)
# End of analysis